# Load the cleaned annotations and convert the identifier, annotator and label
# columns to factors; annotator and label get human-readable level labels.
all_annotations <-
  read.csv(
    here::here("data", "4b-handle-long-annotations", "cleaned_annotations.csv")
  ) %>%
  # X is presumably the unnamed index column exported by pandas -- confirm
  rename(pandas_index = X) %>%
  mutate(
    annotation_id = factor(annotation_id),
    annotator = factor(
      annotator,
      levels = c("cborgelt", "danielE", "danielG", "katharina", "sascha"),
      labels = c("Christian", "Daniel E.", "Daniel G.", "Katharina", "Sascha")
    ),
    line_id = factor(line_id),
    label = factor(
      label,
      levels = c("EVENT", "LOC", "MISC", "ORG", "PER", "TIME"),
      labels = c(
        "Event", "Location", "Miscellaneous", "Organisation", "Person", "Time"
      )
    ),
    # length of an annotation = end minus start
    annotation_length = end - start
  ) %>%
  # keep only annotations with a length of at most 100
  filter(annotation_length <= 100)

# Display the data with datatable(), leaving out the two technical id columns.
all_annotations %>%
  select(-c(pandas_index, annotation_id)) %>%
  datatable()
# Load the union dataset and convert the identifier and label columns to
# factors, using the same human-readable label names as for all_annotations.
union_annotations <-
  read.csv(
    here::here("data", "5a-generate-union-dataset", "union_dataset.csv")
  ) %>%
  # X is presumably the unnamed index column exported by pandas -- confirm
  rename(pandas_index = X) %>%
  mutate(
    annotation_id = factor(annotation_id),
    line_id = factor(line_id),
    label = factor(
      label,
      levels = c("EVENT", "LOC", "MISC", "ORG", "PER", "TIME"),
      labels = c(
        "Event", "Location", "Miscellaneous", "Organisation", "Person", "Time"
      )
    ),
    # length of an annotation = end minus start
    annotation_length = end - start
  )

# Print the union dataset without the two technical id columns.
union_annotations %>%
  select(-c(pandas_index, annotation_id))
# Colour per label, keyed by the technical label names.
doccano_colour_scheme_technical <- c(
  "ATTENTION"         = "#f44e3b",
  "EVENT"             = "#0062b1",
  "LOC"               = "#7b64ff",
  "MISC"              = "#a79c4f",
  "ORG"               = "#16a5a5",
  "PER"               = "#fcdc00",
  "Personal Bookmark" = "#fda1ff",
  "POSTCORR"          = "#9f0500",
  "TIME"              = "#73d8ff",
  "?"                 = "#cccccc"
)

# Colours of the six entity labels, keyed by the human-readable factor labels
# used in the plots.
doccano_colour_scheme_named <- c(
  "Event"         = "#0062b1",
  "Location"      = "#7b64ff",
  "Miscellaneous" = "#a79c4f",
  "Organisation"  = "#16a5a5",
  "Person"        = "#fcdc00",
  "Time"          = "#73d8ff"
)
# Colour per annotator, keyed by the technical annotator names.
# Assigned with `<-` (not `=`) to match the other colour schemes in this file.
annotator_colour_scheme_technical <- c(
  "danielE"   = "#ff7f00",
  "katharina" = "#e4211c",
  "cborgelt"  = "#377eb8",
  "danielG"   = "#984ea3",
  "sascha"    = "#4daf4a"
)

# The same colours, keyed by the human-readable annotator labels used in the
# plots.
annotator_colour_scheme_named <- c(
  "Daniel E." = "#ff7f00",
  "Katharina" = "#e4211c",
  "Christian" = "#377eb8",
  "Daniel G." = "#984ea3",
  "Sascha"    = "#4daf4a"
)
# References for the plots in this section:
# create pie chart: https://r-graph-gallery.com/piechart-ggplot2.html
# relative frequencies: https://homepages.gac.edu/~anienow2/MCS_142/R/ggplot2-barchart.html --> set aes(x = "")
# manual category colours: https://statisticsglobe.com/change-colors-of-bars-in-ggplot2-barchart-in-r
# change legend's title: https://www.geeksforgeeks.org/how-to-change-legend-title-in-ggplot2-in-r/

# Per-label summary of the union dataset: number of annotations plus total,
# median, mean and standard deviation of the annotation length.
global_label_length <-
  union_annotations %>%
  group_by(label) %>%
  summarise(
    count = n(),
    total_length = sum(annotation_length),
    median_length = median(annotation_length),
    mean_length = mean(annotation_length),
    sd_length = sd(annotation_length)
  ) %>%
  mutate(
    # order the label levels by total annotation length so plots follow that
    # order: https://sarahleejane.github.io/learning/r/2014/09/17/ordering-factors-by-another-column-with-R.html
    label = reorder(factor(label), total_length)
  )

global_label_length
# Horizontal stacked bar chart: for each summary measure (count, total, mean
# and median annotation length), the share that every label contributes.
global_label_length %>%
  # long format: one row per label and measure; the numbers end up in the
  # default "value" column of pivot_longer()
  pivot_longer(
    cols = c(count, total_length, mean_length, median_length),
    names_to = "measure"
  ) %>%
  mutate(
    measure = factor(
      measure,
      levels = c("count", "total_length", "mean_length", "median_length"),
      labels = c(
        "Count",
        "Total Annotation Length",
        "Average Annotation Length",
        "Median Annotation Length"
      )
    )
  ) %>%
  # sort nicely for debugging
  arrange(measure, label) %>%
  select(measure, label, value) %>%
  # plot data
  ggplot(aes(x = measure, y = value, fill = label)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity");
  # position = "fill" rescales every bar to 100 %
  geom_col(position = "fill", colour = "black") +
  scale_fill_manual(values = doccano_colour_scheme_named) +
  scale_y_continuous(labels = scales::percent) +
  # axis title typo fixed ("Realative" -> "Relative")
  labs(x = "", y = "Relative Frequency", fill = "Label") +
  coord_flip() +
  theme_classic()
The following pie chart shows the same per-label shares of the total annotation length.
# Pie chart of total annotation length per label: a single stacked bar
# (x = "") drawn in polar coordinates.
global_label_length %>%
  ggplot(mapping = aes(x = "", y = total_length, fill = label)) +
  geom_bar(stat = "identity", width = 1, colour = "black") +
  coord_polar(theta = "y", start = 0) +
  # label colours and legend title
  scale_fill_manual(values = doccano_colour_scheme_named) +
  labs(fill = "Label") +
  theme_void()
# Histogram of annotation lengths (bin width 10), one facet per label.
# To add an "overall" facet, use facet_grid(~label, margins = TRUE); see
# https://stackoverflow.com/questions/54523133/adding-overall-group-to-facet-wrap-ggplot2#comment95851202_54523970
# --------------
all_annotations %>%
  ggplot(aes(x = annotation_length, fill = label)) +
  geom_histogram(binwidth = 10, colour = "black") +
  facet_grid(~label) +
  labs(
    x = "Annotation Length",
    y = "Count"
  ) +
  scale_fill_manual(values = doccano_colour_scheme_named) +
  # hide the fill legend; guides(fill = FALSE) is deprecated in ggplot2,
  # "none" is the supported spelling
  guides(fill = "none") +
  theme_bw()
# Box plot of the annotation length for each label, filled with the label
# colours.
all_annotations %>%
  ggplot(mapping = aes(x = label, y = annotation_length, fill = label)) +
  geom_boxplot() +
  scale_fill_manual(values = doccano_colour_scheme_named) +
  labs(x = "Label", y = "Annotation Length", fill = "Label") +
  theme_bw()
# Per-annotator summary of all annotations: number of annotations plus total,
# median, mean and standard deviation of the annotation length.
annotator_contributions <-
  all_annotations %>%
  group_by(annotator) %>%
  summarise(
    count = n(),
    total_length = sum(annotation_length),
    median_length = median(annotation_length),
    mean_length = mean(annotation_length),
    sd_length = sd(annotation_length)
  )

annotator_contributions
# Horizontal stacked bar chart: for each summary measure (count, total, mean
# and median annotation length), the share that every annotator contributes.
annotator_contributions %>%
  # long format: one row per annotator and measure; the numbers end up in the
  # default "value" column of pivot_longer()
  pivot_longer(
    cols = c(count, total_length, mean_length, median_length),
    names_to = "measure"
  ) %>%
  mutate(
    measure = factor(
      measure,
      levels = c("count", "total_length", "mean_length", "median_length"),
      labels = c(
        "Count",
        "Total Annotation Length",
        "Average Annotation Length",
        "Median Annotation Length"
      )
    )
  ) %>%
  # sort nicely for debugging
  arrange(measure, annotator) %>%
  select(measure, annotator, value) %>%
  # plot data
  ggplot(aes(x = measure, y = value, fill = annotator)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity");
  # position = "fill" rescales every bar to 100 %
  geom_col(position = "fill", colour = "black") +
  scale_y_continuous(labels = scales::percent) +
  # axis title typo fixed ("Realative" -> "Relative")
  labs(x = "", y = "Relative Frequency", fill = "Annotator") +
  scale_fill_manual(values = annotator_colour_scheme_named) +
  coord_flip() +
  theme_classic()
# Histogram of annotation lengths (bin width 10), one facet per annotator.
all_annotations %>%
  ggplot(aes(x = annotation_length, fill = annotator)) +
  geom_histogram(
    binwidth = 10,
    colour = "black"
  ) +
  facet_grid(~annotator) +
  labs(
    x = "Annotation Length",
    y = "Count"
  ) +
  scale_fill_manual(values = annotator_colour_scheme_named) +
  # hide the fill legend; guides(fill = FALSE) is deprecated in ggplot2,
  # "none" is the supported spelling
  guides(fill = "none") +
  theme_bw()
# Box plot of the annotation length for each annotator, filled with the
# annotator colours.
all_annotations %>%
  ggplot(mapping = aes(x = annotator, y = annotation_length, fill = annotator)) +
  geom_boxplot() +
  scale_fill_manual(values = annotator_colour_scheme_named) +
  labs(x = "Annotator", y = "Annotation Length", fill = "Annotator") +
  theme_bw()
# Total annotation length and number of annotations for every combination of
# annotator and label. The result stays grouped by annotator (.groups =
# "drop_last"), as before.
label_annotator <- all_annotations %>%
  group_by(annotator, label) %>%
  summarise(
    total_length = sum(annotation_length),
    count = n(),
    .groups = "drop_last"
  ) %>%
  mutate(
    # Give the labels the same level order as global_label_length$label
    # (ascending global total length). Calling reorder() here with the
    # six-element global vector only worked when every annotator had exactly
    # one row per label in matching order, and failed otherwise. Labels that
    # are missing from the global summary are appended so none become NA.
    label = factor(
      label,
      levels = union(levels(global_label_length$label), levels(label))
    )
  )
# Stacked bar chart per annotator: share of each label in the annotator's
# total annotation length.
# stacked bar chart: https://r-graph-gallery.com/48-grouped-barplot-with-ggplot2 --> position = "fill"
label_annotator %>%
  ggplot(mapping = aes(x = annotator, y = total_length, fill = label)) +
  geom_bar(stat = "identity", position = "fill", colour = "black") +
  # colours, percent axis and titles
  scale_fill_manual(values = doccano_colour_scheme_named) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Annotator", y = "Relative Label Frequency", fill = "Label") +
  theme_classic()
# Stacked bar chart per label: share of each annotator in the label's total
# annotation length.
label_annotator %>%
  ggplot(mapping = aes(x = label, y = total_length, fill = annotator)) +
  geom_bar(stat = "identity", position = "fill", colour = "black") +
  # colours, percent axis and titles
  scale_fill_manual(values = annotator_colour_scheme_named) +
  scale_y_continuous(labels = scales::percent) +
  labs(x = "Label", y = "Relative Label Frequency", fill = "Annotator") +
  theme_classic()
# Histograms of annotation length (bin width 10) in a grid with one row per
# annotator and one column per label.
# The former guides(fill = FALSE) call was dropped: no fill aesthetic is
# mapped here, so there is no legend to hide, and passing FALSE to guides()
# is deprecated in ggplot2.
all_annotations %>%
  ggplot(aes(x = annotation_length)) +
  geom_histogram(binwidth = 10) +
  facet_grid(annotator ~ label) +
  labs(
    x = "Annotation Length",
    y = "Count"
  ) +
  theme_bw()
# Box plots of annotation length in a grid with one row per annotator and one
# column per label; x is a constant so each panel holds a single box.
all_annotations %>%
  ggplot(mapping = aes(x = "", y = annotation_length)) +
  geom_boxplot() +
  facet_grid(annotator ~ label) +
  labs(x = "", y = "Annotation Length") +
  theme_bw()